Statistics 506- Problem Set #5

Author

Garrett Pinkston

Problem 1 - OOP Programming

Create a class to represent rational numbers (numbers of the form for integers \(a\) and \(b\). Do this using S4.

a) For the rational class, define the following:

library(Rcpp)

# Define the Rational Class
setClass(
  "Rational",
  slots = c(numerator = "numeric", denominator = "numeric"),
  validity = function(object) {
    if (object@denominator == 0) {
      stop("Denominator cannot be 0.")
    }
    TRUE
  }
)
  1. A validator that ensures the denominator is non-zero. (Defined above in class)
# Constructor for the Rational Class
Rational <- function(numerator, denominator = 1) {
  new("Rational", numerator = numerator, denominator = denominator)
}
  1. A show method.
# Show Method
setMethod(
  "show",
  "Rational",
  function(object) {
    cat(sprintf("%d/%d\n", object@numerator, object@denominator))
  }
)
  1. A simplify method, to obtain the simplest form (e.g. simplify(2/4) produces 1/2).
# Simplify Method
setGeneric("simplify", function(object) standardGeneric("simplify"))
[1] "simplify"
setMethod(
  "simplify",
  "Rational",
  function(object) {
    gcd <- compute_gcd(object@numerator, object@denominator)
    object@numerator <- object@numerator / gcd
    object@denominator <- object@denominator / gcd
    return(object)
  }
)
  1. A quotient method (e.g. quotient(3/7) produces .42857143…). It should support a digits argument but only in the printing, not the returned result (Hint: what does print return?).
# Quotient Method
setGeneric("quotient", function(object, digits = 7) standardGeneric("quotient"))
[1] "quotient"
setMethod(
  "quotient",
  "Rational",
  function(object, digits = 7) {
    if (!is.numeric(digits) || digits != as.integer(digits) || digits < 0) {
      stop("Digits must be a non-negative integer.")
    }
    result <- object@numerator / object@denominator
    cat(round(result, digits), "\n")
    return(result)
  }
)
  1. Addition, subtraction, multiplication, division. These should all return a rational.
# Arithmetic Methods (+, -, *, /)
setMethod(
  "+",
  c("Rational", "Rational"),
  function(e1, e2) {
    lcm_den <- compute_lcm(e1@denominator, e2@denominator)
    new_num <- (e1@numerator * (lcm_den / e1@denominator)) +
               (e2@numerator * (lcm_den / e2@denominator))
    return(simplify(Rational(new_num, lcm_den)))
  }
)

setMethod(
  "-",
  c("Rational", "Rational"),
  function(e1, e2) {
    lcm_den <- compute_lcm(e1@denominator, e2@denominator)
    new_num <- (e1@numerator * (lcm_den / e1@denominator)) -
               (e2@numerator * (lcm_den / e2@denominator))
    return(simplify(Rational(new_num, lcm_den)))
  }
)

setMethod(
  "*",
  c("Rational", "Rational"),
  function(e1, e2) {
    return(simplify(Rational(e1@numerator * e2@numerator, e1@denominator * e2@denominator)))
  }
)

setMethod(
  "/",
  c("Rational", "Rational"),
  function(e1, e2) {
    if (e2@numerator == 0) {
      stop("Division by zero is not allowed.")
    }
    return(simplify(Rational(e1@numerator * e2@denominator, e1@denominator * e2@numerator)))
  }
)
  1. You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.
cppFunction("
  #include <numeric>`
  int compute_gcd(int a, int b) {
    return std::gcd(a, b);
  }")
cppFunction("
  #include <numeric>
  int compute_lcm(int a, int b) {
    return std::lcm(a, b);
  }")

b) Use your rational class to create three objects:

r1 <- Rational(24, 6)
r2 <- Rational(7, 230)
r3 <- Rational(0, 4)

Evaluate the following code (remember you can tell Quarto not to stop on errors):

r1
24/6
r3
0/4
r1 + r2
927/230
r1 - r2
913/230
r1 * r2
14/115
r1 / r2
920/7
r1 + r3
4/1
r1 * r3
0/1
r2 / r3
Error in r2/r3: Division by zero is not allowed.
quotient(r1)
4 
[1] 4
quotient(r2)
0.0304348 
[1] 0.03043478
quotient(r2, digits = 3)
0.03 
[1] 0.03043478
quotient(r2, digits = 3.14)
Error in quotient(r2, digits = 3.14): Digits must be a non-negative integer.
quotient(r2, digits = "avocado")
Error in quotient(r2, digits = "avocado"): Digits must be a non-negative integer.
q2 <- quotient(r2, digits = 3)
0.03 
q2
[1] 0.03043478
quotient(r3)
0 
[1] 0
simplify(r1)
4/1
simplify(r2)
7/230
simplify(r3)
0/1

c) Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.

# test validator and constructor with invalid inputs

# create a rational number with a zero denominator
tryCatch({
  r_invalid <- Rational(1, 0)
}, error = function(e) {
  cat("Error:", e$message, "\n")
})
Error: Denominator cannot be 0. 
# create a rational number with a non-numeric numerator
tryCatch({
  r_invalid <- Rational("not_a_number", 2)
}, error = function(e) {
  cat("Error:", e$message, "\n")
})
Error: invalid class "Rational" object: invalid object for slot "numerator" in class "Rational": got class "character", should be or extend class "numeric" 
# create a rational number with a non-numeric denominator
tryCatch({
  r_invalid <- Rational(3, "not_a_number")
}, error = function(e) {
  cat("Error:", e$message, "\n")
})
Error: invalid class "Rational" object: invalid object for slot "denominator" in class "Rational": got class "character", should be or extend class "numeric" 
# create a rational number with missing arguments
tryCatch({
  r_invalid <- Rational()
}, error = function(e) {
  cat("Error:", e$message, "\n")
})
Error: argument "numerator" is missing, with no default 
# create a rational number with only one argument (should default denominator to 1)
tryCatch({
  r_valid <- Rational(5)
  r_valid
}, error = function(e) {
  cat("Error:", e$message, "\n")
})
5/1

Problem 2 - plotly

# read in art data
art <- read.csv("/Users/garrettpinkston/Desktop/Michigan/STAT506/Data/df_for_ml_improved_new_market.csv")

Does the distribution of genre of sales across years appear to change?

# load in tidyr for data manipulation
library(tidyr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)

# reshape the 'art' dataset to a long format
df_long <- art %>%
  # used pivot_longer to transform all columns starting with "Genre___" into 
  # key-value pairs
  pivot_longer(cols = starts_with("Genre___"),
               names_to = "genre",       # New column for genre names
               values_to = "count") %>%   # New column for genre values
  # Filter when presence of a specific genre is in a sale
  filter(count == 1)

# visualize the yearly genre distribution
df_long %>%
  
  # group data by both year and genre
  group_by(year, genre) %>%
  
  # count occurrences for each genre per year
  summarize(count = n()) %>%
  
  # group data by year to compute the proportion of each genre
  group_by(year) %>%
  
  # calculate proportion of each genre's count relative to all other counts
  mutate(proportion = count / sum(count)) %>%
  
  # use ggplot, setting 'year' as x-axis and 'proportion' as y-axis
  ggplot(aes(x = year, y = proportion, fill = genre)) +
  
  # using a stacked bar plot to show proportions per genre each year
  geom_bar(stat = "identity", position = "fill") +
  
  # adding title, x-axis, y-axis, and legend labels
  labs(title = "Distribution of Genre Sales Across Years",
       x = "Year",
       y = "Proportion of Sales",
       fill = "Genre") +
  
  # center the plot title
  theme(plot.title = element_text(hjust = 0.5))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

# load libraries
library(dplyr)
library(tidyr)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# reshape data to include genres and calculate yearly averages
df_genre_price <- art %>%
  pivot_longer(cols = starts_with("Genre___"),
               names_to = "genre",
               values_to = "count") %>%
  filter(count == 1) %>% # Keep rows where the genre is present
  mutate(genre = str_remove(genre, "^Genre___")) %>% # Clean genre names
  group_by(year, genre) %>%
  summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop") 

# find overall average sales price per year
df_overall_price <- art %>%
  group_by(year) %>%
  summarize(avg_price = mean(price_usd, na.rm = TRUE), .groups = "drop") %>%
  mutate(genre = "Overall")

# tally overall and genre-specific data
df_combined <- bind_rows(df_genre_price, df_overall_price)

# create an interactive plot
plotly_plot <- plot_ly(df_combined, 
                       x = ~year, 
                       y = ~avg_price, 
                       color = ~genre, 
                       type = 'scatter', 
                       mode = 'lines+markers',
                       line = list(width = 2),
                       marker = list(size = 6)) %>%
  layout(title = "Change in Sales Price Over Time by Genre",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Average Sales Price (USD)"),
         legend = list(title = list(text = "Genre")),
         hovermode = "x unified")

# display plot
plotly_plot

Problem 3 - data.table

Install and load the package nycflights13.

# load in data
library(nycflights13)

a) Generate a table (which can just be a nicely printed tibble) reporting the mean and median departure delay per airport. Generate a second table (which again can be a nicely printed tibble) reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights. Do this exclusion through code, not manually.

Additionally,

  • Order both tables in descending mean delay.
  • Both tables should use the airport names not the airport codes.
  • Both tables should print all rows.
# load the necessary libraries for data manipulation
library(nycflights13)  # contains flight data
library(data.table)    # for efficient data manipulation

Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':

    between, first, last
# convert the flights and airports data to data.table format
flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)

# part a: compute mean and median departure delays for airports with 10 or more flights
dep_stats <- flights_dt[
  # count the number of flights per destination airport
  , .(count = .N), by = .(dest)
][
  # filter to include only airports with at least 10 flights
  count >= 10
][
  # join back to the flights data on destination airport
  flights_dt, on = "dest"
][
  # calculate the mean and median departure delays per destination
  , .(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),   # average departure delay
    median_dep_delay = median(dep_delay, na.rm = TRUE) # median departure delay
  ), by = .(dest)
][
  # join with the airports data to get airport names
  airports_dt, on = .(dest = faa)
][
  # sort the results by mean departure delay in descending order
  order(-mean_dep_delay)
][
  # select only relevant columns: airport name, mean, and median delays
  , .(name, mean_dep_delay, median_dep_delay)
]

# part a: Compute mean and median arrival delays for airports with 10 or more flights
arr_stats <- flights_dt[
  # count the number of flights per destination airport
  , .(count = .N), by = .(dest)
][
  # filter to include only airports with at least 10 flights
  count >= 10
][
  # join back to the flights data on destination airport
  flights_dt, on = "dest"
][
  # calculate the mean and median arrival delays per destination
  , .(
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),    # Average arrival delay
    median_arr_delay = median(arr_delay, na.rm = TRUE) # Median arrival delay
  ), by = .(dest)
][
  # join with the airports data to get airport names
  airports_dt, on = .(dest = faa)
][
  # sort the results by mean arrival delay in descending order
  order(-mean_arr_delay)
][
  # select only relevant columns: airport name, mean, and median delays
  , .(name, mean_arr_delay, median_arr_delay)
]

# print the results for departure and arrival delay statistics
print(dep_stats)
                           name mean_dep_delay median_dep_delay
                         <char>          <num>            <num>
   1:     Columbia Metropolitan       35.57009               14
   2:                Tulsa Intl       34.90635                8
   3:         Will Rogers World       30.56881               10
   4:           Birmingham Intl       29.69485                1
   5:             Mc Ghee Tyson       28.49396                0
  ---                                                          
1454:                Black Rock             NA               NA
1455:    New Haven Rail Station             NA               NA
1456: Wilmington Amtrak Station             NA               NA
1457:  Washington Union Station             NA               NA
1458:              Penn Station             NA               NA
print(arr_stats)
                           name mean_arr_delay median_arr_delay
                         <char>          <num>            <num>
   1:     Columbia Metropolitan       41.76415               28
   2:                Tulsa Intl       33.65986               14
   3:         Will Rogers World       30.61905               16
   4:      Jackson Hole Airport       28.09524               15
   5:             Mc Ghee Tyson       24.06920                2
  ---                                                          
1454:                Black Rock             NA               NA
1455:    New Haven Rail Station             NA               NA
1456: Wilmington Amtrak Station             NA               NA
1457:  Washington Union Station             NA               NA
1458:              Penn Station             NA               NA

b) How many flights did the aircraft model with the fastest average speed take? Produce a tibble with 1 row, and entires for the model, average speed (in MPH) and number of flights.

# load necessary libraries
library(nycflights13)
library(data.table)

# convert flights and planes datasets to data.table
flights_dt <- as.data.table(flights)
planes_dt <- as.data.table(planes)

# join flights with planes to include model information
flights_planes_dt <- flights_dt[
  !is.na(air_time) & !is.na(distance),  # exclude rows with missing air_time or distance
  ][planes_dt, on = "tailnum"]

# calculate speed and aggregate by model
model_speeds <- flights_planes_dt[
  , .(
    avg_speed_mph = mean(distance / (air_time / 60), na.rm = TRUE),  # calculate average speed
    num_flights = .N  # count number of flights
  ), by = model
]

# find the model with the fastest average speed
fastest_model <- model_speeds[
  avg_speed_mph == max(avg_speed_mph, na.rm = TRUE)
]

# print the result
print(fastest_model)
     model avg_speed_mph num_flights
    <char>         <num>       <int>
1: 777-222      482.6254           4